
########################################################################
########################################################################
## make family-corp-product crosswalk 
## 03 Mar 2015
########################################################################
########################################################################


## read in data

  ## fam-biz matches
fb1 <- read.csv("01_Data/02_Clean/top_0911_cpi_own.csv")
fb2 <- read.csv("01_Data/02_Clean/own_clean.csv")
fb3 <- read.csv('01_Data/01_Raw/03_Family/own_orig.csv')

  ## market shares
bh <- read.csv("01_Data/02_Clean/agemar_ha.csv")

  ## supply prices
pri <- read.csv('01_Data/02_Clean/prices_all_month.csv')


## make bh hscode into four digits
bh$hs_four <- sprintf("%04d", bh$hs_four)
bh$hs_four <- strtrim(bh$hs_four, 4)


#####
## prep business - hscode match using agemar data
#####

## get rid of earthquake year
bh <- subset(bh, bh$year!=2010, select = -c(X))

setnames(bh, 'hs_four', 'hs_4')

## aggregate by consignee and hscode
bh <- aggregate(cbind(qty, wgt_kg) ~ con_final + hs_4, data = bh,
                FUN = function(x) mean(x, na.rm=T))

## make measures of cumulative quantity and weight
bh <- bh[order(bh$hs_4, -bh$qty),]
bh$qty_cum <- unlist(tapply(bh$qty, bh$hs_4, FUN = function(x) cumsum(x)/sum(x))) 
bh$qty_pct <- unlist(tapply(bh$qty, bh$hs_4, FUN = function(x) x/sum(x))) 
bh <- bh[order(bh$hs_4, -bh$wgt_kg),]
bh$wgt_cum <- unlist(tapply(bh$wgt_kg, bh$hs_4, FUN = function(x) cumsum(x)/sum(x))) 
bh$wgt_pct <- unlist(tapply(bh$wgt_kg, bh$hs_4, FUN = function(x) x/sum(x))) 

## drop importers who bring in the last 10% of each good
bh$drop <- ifelse(bh$wgt_cum - bh$wgt_pct > 0.9, 1, 0)
table(bh$drop)
bh <- subset(bh, bh$drop==0)



#####
## prep family - business match
#####

fb1 <- subset(fb1, select = c(con_final, fam))
fb2 <- subset(fb2, select = c(comp_final, fam))
setnames(fb2, 'comp_final', 'con_final')


## make fb1 long
temp <- strsplit(fb1$fam, "; ")
ncol = max(sapply(temp,length))
temp <- as.data.table(lapply(1:ncol, FUN = function(x) sapply(temp,"[",x)))
setnames(temp, paste0("V",seq(1:ncol)), paste0('fam_',seq(1,ncol)))

fb1 <- cbind.data.frame('con_final' = fb1[,1], temp)

fb1 <- reshape(fb1, dir = "long", varying = c(2:8), sep = "_")
fb1 <- subset(fb1, select = c(con_final, fam), is.na(fb1$fam)==F)


fb <- smartbind(fb1, fb2, fb3)
fb <- unique(fb)

## make var for number of fams per company

temp <- aggregate(fam ~ con_final, data = fb, FUN = function(x) length(unique(x)))
setnames(temp, 'fam', 'nown')

fb <- merge(fb, temp, by = 'con_final', all.x = T)


#####
## merge fam-biz and biz-prod
#####

dat <- merge(bh, fb, by = "con_final", all = T)


## change mis-spelled names in the fam data

dat$fam <- ifelse(dat$fam=="DE LA FUENTE", "DELAFUENTE", dat$fam)
dat$fam <- ifelse(dat$fam=="D'ADESKY", "DADESKY", dat$fam)
dat$fam <- ifelse(dat$fam=="COQUILLON", "COQUILLEAU", dat$fam)
dat$fam <- ifelse(dat$fam=="BOUGUIGNON", "BOURGUIGNOL", dat$fam)
dat$fam <- ifelse(dat$fam=="GUILLIOT", "GUILLOT", dat$fam)
dat$fam <- ifelse(dat$fam=="SAINT FORT", "ST-FORT", dat$fam)
dat$fam <- ifelse(dat$fam=="ST CLAIRE", "SAINT CLAIR", dat$fam)
dat$fam <- ifelse(dat$fam=="ST-FELIX", "SAINT FELIX", dat$fam)
dat$fam <- ifelse(dat$fam=="ROSSEAU", "ROUSSEAU", dat$fam)
dat$fam <- ifelse(dat$fam=="O'BRIEN", "OBRIEN", dat$fam)
dat$fam <- ifelse(dat$fam=="MORISET", "MORISSET", dat$fam)
dat$fam <- ifelse(dat$fam=="MONDERSIR", "MONDESIR", dat$fam)
dat$fam <- ifelse(dat$fam=="MONBRUN", "MOMBRUN", dat$fam)
dat$fam <- ifelse(dat$fam=="MILFORD", "MILFORT", dat$fam)
dat$fam <- ifelse(dat$fam=="MAZOUKA", "MARZOUKA", dat$fam)
dat$fam <- ifelse(dat$fam=="MARA", "MARRA", dat$fam)
dat$fam <- ifelse(dat$fam=="LOUIS JEUNE", "LOUIS-JEUNE", dat$fam)
dat$fam <- ifelse(dat$fam=="LE CORPS", "LECORPS", dat$fam)
dat$fam <- ifelse(dat$fam=="LAFERIERE", "LAFERRIERE", dat$fam)
dat$fam <- ifelse(dat$fam=="JEANOT", "JEANNOT", dat$fam)
dat$fam <- ifelse(dat$fam=="JEAN BARD", "JEAN-BART", dat$fam)
dat$fam <- ifelse(dat$fam=="FILSAIME", "FILS-AIME", dat$fam)
dat$fam <- ifelse(dat$fam=="EDMON", "EDMOND", dat$fam)
dat$fam <- ifelse(dat$fam=="DOCE", "DOLCE", dat$fam)
dat$fam <- ifelse(dat$fam=="CHAMPON", "CHAMPION", dat$fam)
dat$fam <- ifelse(dat$fam=="BOURGUIGNON", "BOURGUIGNOL", dat$fam)
dat$fam <- ifelse(dat$fam=="BOUGUIGNON", "BOURGUIGNOL", dat$fam)
dat$fam <- ifelse(dat$fam=="BIJOUX", "BIJOU", dat$fam)
dat$fam <- ifelse(dat$fam=="BEHRMANN", "BERHMANN", dat$fam)
dat$fam <- ifelse(dat$fam=="AUXILA", "AUXILLA", dat$fam)
dat$fam <- ifelse(dat$fam=="CROICY", "CROISSY", dat$fam)
dat$fam <- ifelse(dat$fam=="DESTIN", "DESTINE", dat$fam)
dat$fam <- ifelse(dat$fam=="SAINT FELIX", "SAINT-FELIX", dat$fam)
dat$fam <- ifelse(dat$fam=="PAUTYNSKY", "PAUTENSKI", dat$fam)
dat$fam <- ifelse(dat$fam=="HILEILE", "HILLEL", dat$fam)


#####
## merge in supply prices to calculate value
#####

pri$hs_four <- sprintf("%04d", pri$hs_four)
pri <- merge(pri, reg, by = c('year', 'month'), all.x = T)
pri$autoc <- car::recode(pri$regime, "c('d1','d2','d3')=0; c('n1','n2')=1")

pri <- data.table(pri)
pri[, price_wo_0212 := median(price_wo, na.rm=T), by = hs_four ]
pri$price_wo_02 <- ifelse(pri$year==2002, pri$price_wo, NA)
pri[, price_wo_02 := median(price_wo_02, na.rm=T), by = hs_four ]
pri$price_wo_autoc <- ifelse(pri$autoc==1, pri$price_wo, NA)
pri[, price_wo_autoc := median(price_wo_autoc, na.rm=T), by = hs_four ]
pri$price_ha_02 <- ifelse(pri$year==2002, pri$price_ha, NA)
pri[, price_ha_02 := median(price_ha_02, na.rm=T), by = hs_four ]

pri <- unique(subset(pri, select = c(hs_four, price_wo_0212, price_wo_02, price_wo_autoc, price_ha_02)))

dat <- merge(dat, pri, by.x = 'hs_4', by.y = 'hs_four', all.x = T)

## create value vars
dat$value_1 = dat$qty * dat$price_wo_02 
dat$value_wo_02 = dat$qty * dat$price_wo_02 / dat$nown
dat$value_ha_02 = dat$qty * dat$price_ha_02 / dat$nown
dat$value_wo_0212 = dat$qty * dat$price_wo_0212 / dat$nown
dat$value_wo_autoc = dat$qty * dat$price_wo_autoc / dat$nown
dat$value_bin = ifelse(dat$qty>0, 1, 0) * dat$price_wo_02
dat$value_bin0212 = ifelse(dat$qty>0, 1, 0) * dat$price_wo_0212
dat$value_binautoc = ifelse(dat$qty>0, 1, 0) * dat$price_wo_autoc
dat$value_log = log(dat$value_wo_02/dat$nown+1)
dat$value_log0212 = log(dat$value_wo_0212/dat$nown+1)
dat$value_logautoc = log(dat$value_wo_autoc/dat$nown+1)


## write to dta
write.dta(dat, '01_Data/02_Clean/fam_biz_prod.dta')
